In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
In [2]:
# Load the HR attrition dataset (one row per employee; 'left' is the target)
# and peek at the last few rows to sanity-check the load.
df = pd.read_csv("HR_Analytics.csv")
df.tail()
Out[2]:
In [3]:
# Change the column name from 'sales' to 'Department' because that column have same value(sales)
# The 'sales' column actually holds department names (it is just that many
# rows have the value 'sales'), so give it a clearer name.
df = df.rename(columns={'sales': 'Department'})
df.tail()
Out[3]:
In [4]:
#colum 'job' & 'salary' encoding
# Encode the two categorical columns ('Department' and 'salary') as integers.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['Department'] = le.fit_transform(df['Department'])

# 'salary' is ordinal (low < medium < high), so map it explicitly rather than
# with LabelEncoder, which would order the labels alphabetically
# (high=0, low=1, medium=2) and destroy the ordering.
# Assign the result back instead of calling .replace(..., inplace=True) on the
# column: in-place replace on a column view is a chained-assignment pitfall
# (deprecated, and a silent no-op under pandas copy-on-write).
df['salary'] = df['salary'].replace(['low', 'medium', 'high'], [0, 1, 2])
df.tail()
Out[4]:
In [22]:
# Pairwise scatter plots across every numeric column — a quick visual scan
# for correlated features and odd distributions.
pair_grid = sns.pairplot(df)
plt.show()
In [23]:
# Correlation heatmap of the features, excluding the target 'left'.
# NOTE: df.drop('left', 1) relied on the positional 'axis' argument, which was
# deprecated and removed in pandas 2.0 — use the columns= keyword instead.
df1 = df.drop(columns='left')
corr_data = df1.corr()  # idiomatic instance call instead of pd.DataFrame.corr(df1)
sns.heatmap(corr_data, annot=True, linewidths=1)
plt.show()
In [19]:
# Rank the features by importance with an ExtraTrees ensemble.
from sklearn.ensemble import ExtraTreesClassifier

X = df.drop(columns='left')  # positional axis argument was removed in pandas 2.0
y = df['left']

# Seed the ensemble so the ranking is reproducible across re-runs.
forest = ExtraTreesClassifier(random_state=42)
forest.fit(X, y)

importances = forest.feature_importances_
# Spread of each feature's importance across the individual trees,
# used as error bars on the plot below.
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
indices = np.argsort(importances)[::-1]  # feature indices, most important first

print("Feature ranking:")
for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()
In [16]:
# Mean satisfaction level for stayers (left=0) vs leavers (left=1).
sns.barplot(x='left', y='satisfaction_level', data=df)
# BUG FIX: seaborn removed its 'sns.plt' alias long ago — call matplotlib's
# pyplot directly for the title.
plt.title('Left over satisfaction_level')
plt.show()
In [41]:
# Density of satisfaction_level, split by whether the employee left.
facet = sns.FacetGrid(df, hue="left", aspect=2)
facet.map(sns.kdeplot, 'satisfaction_level', shade=True)
facet.add_legend()
plt.show()
In [11]:
# Attrition rate by number of projects worked on.
sns.barplot(x='number_project', y='left', data=df)
# BUG FIX: 'sns.plt' no longer exists in seaborn — use pyplot directly.
plt.title('Left over Number of project')
plt.show()
In [13]:
# Density of number_project, split by attrition.
facet = sns.FacetGrid(df, hue="left", aspect=2)
facet.map(sns.kdeplot, 'number_project', shade=True)
facet.add_legend()
plt.show()
In [20]:
# Attrition rate by years spent at the company.
sns.barplot(x='time_spend_company', y='left', data=df)
# BUG FIX: 'sns.plt' was removed from seaborn — use pyplot directly.
plt.title('Left over time_spend_company')
plt.show()
In [19]:
# Density of time_spend_company, split by attrition.
facet = sns.FacetGrid(df, hue="left", aspect=2)
facet.map(sns.kdeplot, 'time_spend_company', shade=True)
facet.add_legend()
plt.show()
In [33]:
# Density of last_evaluation, split by attrition.
facet = sns.FacetGrid(df, hue="left", aspect=2)
facet.map(sns.kdeplot, 'last_evaluation', shade=True)
facet.add_legend()
plt.show()
In [34]:
# Counts of stayers vs leavers, split by whether they were promoted
# in the last five years.
sns.countplot(x='promotion_last_5years', hue="left", data=df)
# BUG FIX: 'sns.plt' no longer exists in seaborn — use pyplot directly.
plt.title('Left over promotion_last_5years')
plt.show()
In [ ]:
In [37]:
# Hold out a third of the data for evaluation, with a fixed seed so the
# split (and all model scores below) is reproducible.
from sklearn.model_selection import train_test_split

y = df['left']
X = df.drop(columns='left')  # positional axis argument was removed in pandas 2.0
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
In [38]:
# Baseline model: logistic regression.
from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression()
logistic.fit(X_train, y_train)

# BUG FIX: the Python-2 'print "..."' statements are syntax errors in
# Python 3 (and inconsistent with the print() calls used earlier in this
# notebook) — use the print function.
logistic_score_train = logistic.score(X_train, y_train)
print("Train Score : ", logistic_score_train)
logistic_score_test = logistic.score(X_test, y_test)
print("-" * 40)
print("Test Score : ", logistic_score_test)
In [39]:
# Support vector machine with the default RBF kernel.
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train, y_train)

# BUG FIX: Python-2 print statements converted to print() calls (Python 3).
svm_score_train = svm.score(X_train, y_train)
print("Train Score : ", svm_score_train)
svm_score_test = svm.score(X_test, y_test)
print("-" * 40)
print("Test Score : ", svm_score_test)
In [40]:
# Single decision tree — expect a large train/test gap (overfitting).
from sklearn import tree

decision = tree.DecisionTreeClassifier()
decision.fit(X_train, y_train)

# BUG FIX: Python-2 print statements converted to print() calls (Python 3).
decision_score_train = decision.score(X_train, y_train)
print("Train score : ", decision_score_train)
decision_score_test = decision.score(X_test, y_test)
print("-" * 40)
print("Test score : ", decision_score_test)
In [49]:
# Random forest ensemble.
from sklearn.ensemble import RandomForestClassifier

# Named 'rf' rather than 'random': a variable called 'random' shadows the
# stdlib random module. (The score variables keep their original names —
# they are referenced by the model-comparison cell below.)
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# BUG FIX: Python-2 print statements converted to print() calls (Python 3).
random_score_train = rf.score(X_train, y_train)
print("Training score : ", random_score_train)
random_score_test = rf.score(X_test, y_test)
print("-" * 40)
print("Testing score : ", random_score_test)
In [45]:
# Side-by-side train/test accuracy of the four models,
# sorted with the weakest test score first.
score_table = {
    'Model': ['Logistic Regression', 'SVM', 'Decision Tree', 'Random Forest'],
    'Train_Score': [logistic_score_train, svm_score_train, decision_score_train, random_score_train],
    'Test_Score': [logistic_score_test, svm_score_test, decision_score_test, random_score_test],
}
models = pd.DataFrame(score_table)
models.sort_values(by='Test_Score', ascending=True)
Out[45]:
In [ ]:
In [ ]: